# ---- Load & clean ---------------------------------------------------------
# Read the raw Titanic data and visualize column types / missingness.
titanic <- read_csv("titanic.csv")
visdat::vis_dat(titanic, palette = "cb_safe") +
  labs(title = "Raw Data Visualization")
# Extract the honorific title (e.g. "Mr", "Miss") from the Name column.
# FIX: spell out `regex =` — the original `reg =` only worked through
# partial argument matching.
titled <- titanic %>%
  extract(Name, into = "Title", regex = " ([A-Za-z]+)\\.")
# Collapse all rare, high-ranking titles into a single "HighRank" level.
# FIX: `nrow(.)` replaces the hard-coded 66 so the recode survives changes
# to the input data. (Also renamed `t` -> `titled`: `t` masks base::t().)
highrank <- filter(titled, !Title %in% c("Mr", "Mrs", "Miss", "Ms")) %>%
  select(-Title) %>%
  cbind(data.frame(Title = rep("HighRank", nrow(.))))
clean <- filter(titled, Title %in% c("Mr", "Mrs", "Miss", "Ms")) %>%
  rbind(highrank) %>%
  arrange(PassengerId) %>%
  select(-Cabin) %>%            # Cabin is mostly missing
  na.omit() %>%                 # drop remaining incomplete rows
  select(-Ticket, -PassengerId) %>%
  mutate(Pclass = as.factor(Pclass),
         Survived = as.logical(Survived))
# Manual correction of a single title.
# NOTE(review): row 356 is a positional index — verify it still points at
# the intended passenger if the cleaning steps above ever change.
clean[356, "Title"] <- "Miss"
# ---- Train/test split & baseline model ------------------------------------
set.seed(5318)
# 80/20 random split.
# FIX: renamed `sample` -> `in_train` to avoid masking base::sample().
in_train <- sample(c(TRUE, FALSE), nrow(clean), replace = TRUE, prob = c(0.8, 0.2))
train <- clean[in_train, ]
test <- clean[!in_train, ]
# Model 1: logistic regression of Survived on all remaining predictors.
model1 <- glm(Survived ~ ., data = train, family = "binomial")
summary(model1)
##
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6712 -0.5706 -0.3360 0.5577 2.5193
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.215065 614.240534 0.026 0.978939
## Pclass2 -1.475395 0.400961 -3.680 0.000234 ***
## Pclass3 -2.649996 0.417732 -6.344 2.24e-10 ***
## TitleMiss -11.605639 614.240465 -0.019 0.984925
## TitleMr -1.565998 0.429437 -3.647 0.000266 ***
## TitleMrs -10.539407 614.240551 -0.017 0.986310
## Sexmale -12.626797 614.240528 -0.021 0.983599
## Age -0.054424 0.010949 -4.971 6.66e-07 ***
## SibSp -0.498806 0.148219 -3.365 0.000765 ***
## Parch -0.197488 0.152584 -1.294 0.195565
## Fare 0.002177 0.002888 0.754 0.450934
## EmbarkedQ -0.811946 0.726763 -1.117 0.263905
## EmbarkedS -0.397705 0.322219 -1.234 0.217103
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 762.00 on 568 degrees of freedom
## Residual deviance: 465.89 on 556 degrees of freedom
## AIC: 491.89
##
## Number of Fisher Scoring iterations: 13
# Likelihood-ratio test against the null model: null deviance 762.00 (568 df)
# vs residual deviance 465.89 (556 df), from summary(model1).
# FIX: the original used 456.89, a typo for 465.89; the printed p-value
# below predates the fix — re-run to refresh.
pchisq(762 - 465.89, 568 - 556, lower.tail = FALSE)
## [1] 3.967624e-58
# Generalized VIFs: Title and Sex are nearly aliased (huge GVIFs), which
# also explains the inflated standard errors in summary(model1).
car::vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## Pclass 2.354714e+00 2 1.238752
## Title 1.043554e+07 3 14.782656
## Sex 6.768920e+06 1 2601.714882
## Age 1.858304e+00 1 1.363196
## SibSp 1.341332e+00 1 1.158159
## Parch 1.410742e+00 1 1.187746
## Fare 1.608805e+00 1 1.268387
## Embarked 1.210050e+00 2 1.048820
# Model 2: refit without Sex, which the VIFs above show is nearly aliased
# with Title.
model2 <- update(model1, .~. - Sex)
summary(model2)
##
## Call:
## glm(formula = Survived ~ Pclass + Title + Age + SibSp + Parch +
## Fare + Embarked, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6759 -0.5686 -0.3349 0.5597 2.5210
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.643551 0.667890 5.455 4.89e-08 ***
## Pclass2 -1.493393 0.400446 -3.729 0.000192 ***
## Pclass3 -2.664176 0.417739 -6.378 1.80e-10 ***
## TitleMiss 0.988152 0.435553 2.269 0.023285 *
## TitleMr -1.605136 0.424125 -3.785 0.000154 ***
## TitleMrs 2.060010 0.502259 4.101 4.11e-05 ***
## Age -0.054614 0.010955 -4.985 6.19e-07 ***
## SibSp -0.504292 0.148097 -3.405 0.000661 ***
## Parch -0.201174 0.152773 -1.317 0.187899
## Fare 0.002197 0.002897 0.758 0.448303
## EmbarkedQ -0.817465 0.726489 -1.125 0.260493
## EmbarkedS -0.396807 0.321876 -1.233 0.217653
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 762.00 on 568 degrees of freedom
## Residual deviance: 466.43 on 557 degrees of freedom
## AIC: 490.43
##
## Number of Fisher Scoring iterations: 5
# Likelihood-ratio test against the null model (deviances taken from the
# summary above).
pchisq(762 - 466.43, 568-557, lower.tail = F)
## [1] 7.509407e-57
# With Sex removed, all generalized VIFs are modest — the collinearity
# problem is resolved.
car::vif(model2)
## GVIF Df GVIF^(1/(2*Df))
## Pclass 2.360578 2 1.239523
## Title 1.827415 3 1.105706
## Age 1.858486 1 1.363263
## SibSp 1.339123 1 1.157205
## Parch 1.409782 1 1.187342
## Fare 1.610194 1 1.268934
## Embarked 1.209175 2 1.048630
# Backward stepwise selection starting from model 2.
# FIX: step() has no `criteria` argument — the original `criteria = "AIC"`
# was silently swallowed by `...`. AIC is already the default criterion
# (penalty k = 2), so it is simply dropped here.
step(model2, direction = "backward")
## Start: AIC=490.43
## Survived ~ Pclass + Title + Age + SibSp + Parch + Fare + Embarked
##
## Df Deviance AIC
## - Embarked 2 468.46 488.46
## - Fare 1 467.05 489.05
## - Parch 1 468.23 490.23
## <none> 466.43 490.43
## - SibSp 1 479.37 501.37
## - Age 1 494.61 516.61
## - Pclass 2 509.32 529.32
## - Title 3 625.52 643.52
##
## Step: AIC=488.46
## Survived ~ Pclass + Title + Age + SibSp + Parch + Fare
##
## Df Deviance AIC
## - Fare 1 469.54 487.54
## - Parch 1 470.12 488.12
## <none> 468.46 488.46
## - SibSp 1 483.07 501.07
## - Age 1 499.30 517.30
## - Pclass 2 515.05 531.05
## - Title 3 627.84 641.84
##
## Step: AIC=487.54
## Survived ~ Pclass + Title + Age + SibSp + Parch
##
## Df Deviance AIC
## - Parch 1 470.75 486.75
## <none> 469.54 487.54
## - SibSp 1 483.62 499.62
## - Age 1 502.34 518.34
## - Pclass 2 560.74 574.74
## - Title 3 629.71 641.71
##
## Step: AIC=486.75
## Survived ~ Pclass + Title + Age + SibSp
##
## Df Deviance AIC
## <none> 470.75 486.75
## - SibSp 1 487.93 501.93
## - Age 1 502.50 516.50
## - Pclass 2 561.90 573.90
## - Title 3 636.47 646.47
##
## Call: glm(formula = Survived ~ Pclass + Title + Age + SibSp, family = "binomial",
## data = train)
##
## Coefficients:
## (Intercept) Pclass2 Pclass3 TitleMiss TitleMr TitleMrs
## 3.48123 -1.74922 -2.98355 1.09904 -1.46017 2.03943
## Age SibSp
## -0.05632 -0.55124
##
## Degrees of Freedom: 568 Total (i.e. Null); 561 Residual
## Null Deviance: 762
## Residual Deviance: 470.7 AIC: 486.7
# Model 5: model 4 without Parch — matching the final model chosen by the
# backward step() run above.
# NOTE(review): model3 and model4 are never defined in this file as shown.
# From the step() trace, model4 is presumably
# Survived ~ Pclass + Title + Age + SibSp + Parch — confirm the missing
# definitions.
model5 <- update(model4, .~. -Parch)
summary(model5)
##
## Call:
## glm(formula = Survived ~ Pclass + Title + Age + SibSp, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8057 -0.5646 -0.3238 0.5677 2.5301
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.48123 0.58099 5.992 2.07e-09 ***
## Pclass2 -1.74922 0.34621 -5.052 4.36e-07 ***
## Pclass3 -2.98355 0.34928 -8.542 < 2e-16 ***
## TitleMiss 1.09904 0.43195 2.544 0.010947 *
## TitleMr -1.46017 0.41604 -3.510 0.000449 ***
## TitleMrs 2.03943 0.48643 4.193 2.76e-05 ***
## Age -0.05632 0.01069 -5.269 1.37e-07 ***
## SibSp -0.55124 0.14214 -3.878 0.000105 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 762.00 on 568 degrees of freedom
## Residual deviance: 470.75 on 561 degrees of freedom
## AIC: 486.75
##
## Number of Fisher Scoring iterations: 5
# Likelihood-ratio test for model 5 (deviances from the summary above).
pchisq(762.00- 470.75, 568-561, lower.tail = F )
## [1] 4.465272e-59
# Hold-out accuracy for model 2: classify at p > 0.5 and tally correct vs.
# incorrect predictions on the test set.
test %>%
  mutate(Prediction = predict(model2, test, type = "response"),
         pred.survival = Prediction > 0.5,
         accurate = pred.survival == Survived) %>%
  group_by(accurate) %>%
  count()
## # A tibble: 2 × 2
## # Groups: accurate [2]
## accurate n
## <lgl> <int>
## 1 FALSE 25
## 2 TRUE 118
# Hold-out accuracy for model 5, computed the same way as for model 2.
test %>%
  mutate(Prediction = predict(model5, test, type = "response"),
         pred.survival = Prediction > 0.5,
         accurate = pred.survival == Survived) %>%
  group_by(accurate) %>%
  count()
## # A tibble: 2 × 2
## # Groups: accurate [2]
## accurate n
## <lgl> <int>
## 1 FALSE 23
## 2 TRUE 120
# Hold-out accuracy (correct / total) for each candidate model, using the
# tallies printed above.
acc_model2 <- 118 / (25 + 118)   # model 2
acc_model2
## [1] 0.8251748
acc_model5 <- 120 / (120 + 23)   # model 5
acc_model5
## [1] 0.8391608
# Model 5 has the smallest AIC of the candidates. AIC measures goodness of
# fit while penalizing model complexity, so a smaller AIC indicates a better
# model. Combined with its higher prediction accuracy, model 5 was chosen.
# Whole-dataset accuracy check.
# NOTE(review): this predicts with model3, which is never defined in the
# visible code, and the text above chose model 5 — presumably this should
# use model5; verify.
predict(model3, clean, type = "response") %>%
as.data.frame() %>%
rename("Prediction" = ".") %>%
mutate(pred.survival = (Prediction > 0.5)) %>%
cbind(clean) %>%
select(pred.survival, Survived) %>%
mutate(accurate = (pred.survival == Survived)) %>%
group_by(accurate) %>%
count()
## # A tibble: 2 × 2
## # Groups: accurate [2]
## accurate n
## <lgl> <int>
## 1 FALSE 126
## 2 TRUE 586
# NOTE(review): this chunk is an exact duplicate of the previous
# whole-dataset accuracy chunk (same undefined model3, same output) —
# presumably accidental, or one of the two was meant to use a different
# model; verify before removing.
predict(model3, clean, type = "response") %>%
as.data.frame() %>%
rename("Prediction" = ".") %>%
mutate(pred.survival = (Prediction > 0.5)) %>%
cbind(clean) %>%
select(pred.survival, Survived) %>%
mutate(accurate = (pred.survival == Survived)) %>%
group_by(accurate) %>%
count()
## # A tibble: 2 × 2
## # Groups: accurate [2]
## accurate n
## <lgl> <int>
## 1 FALSE 126
## 2 TRUE 586
# Visual check of the cleaned data, then pairwise correlations among the
# numeric-ish columns (Survived is logical here, so cor() coerces it to 0/1).
visdat::vis_dat(clean, palette = "cb_safe") +
  labs(title = "Cleaned Data Visualization")
num_cors <- clean %>%
  select(Age, SibSp, Parch, Fare, Survived) %>%
  cor()
corrplot::corrplot(num_cors)
# Fare distributions by passenger class (free y-scale per class), followed
# by an interactive plotly version of the same figure.
p_fare_class <- ggplot(clean, aes(x = Pclass, y = Fare, fill = Pclass)) +
  geom_boxplot() +
  facet_wrap(~Pclass, scale = "free_y") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")
p_fare_class
ggplotly()
# Survival counts by age, faceted by sex.
# FIX: dropped the no-op group_by() before ggplot() (ggplot ignores dplyr
# grouping) and corrected the "surival" typo in the title.
clean %>%
  ggplot(aes(x = Age, fill = Survived)) +
  geom_histogram() +
  facet_wrap(~Sex) +
  labs(title = "Number of survivors by Age and Gender") +
  scale_fill_brewer(palette = "Set1") +
  theme_test()
# Survival proportion by class and sex: count each (class, survived, sex)
# cell, then normalize within (class, sex).
# FIX: corrected the "surival" typo and the misleading "Number of" wording —
# the plot shows proportions (hence the "Probability" y-axis), not counts.
clean %>%
  group_by(Pclass, Survived, Sex) %>%
  count() %>%
  ungroup() %>%
  group_by(Pclass, Sex) %>%
  mutate(surv_prop = n / sum(n)) %>%
  ggplot(aes(x = Pclass, y = surv_prop, fill = Survived)) +
  geom_col() +
  facet_wrap(~Sex) +
  labs(title = "Survival proportion by Passenger Class and Gender",
       y = "Probability", x = "Class") +
  scale_fill_brewer(palette = "Set1") +
  theme_test()
################### Data ###################
# Survived becomes "TRUE"/"FALSE" strings so base plot() treats it as a
# category below.
clean$Survived <- as.character(clean$Survived)
# Plot all variables; many are categorical, so this is only a quick scan.
plot(clean)
# Keep only the numeric-ish columns for the correlation chart.
# NOTE(review): -c(2, 3, 4, 9) is positional (drops Pclass, Title, Sex,
# Embarked given the current column order) — fragile if `clean` changes.
clean2 <- clean[, -c(2, 3, 4, 9)]
# FIX: Survived is character ("TRUE"/"FALSE") at this point, and
# as.numeric() on those strings yields NA with a warning. Convert back
# through as.logical() to get the intended 0/1 coding.
clean2$Survived <- as.numeric(as.logical(clean2$Survived))
clean2 %>%
  select(-Survived) %>%
  PerformanceAnalytics::chart.Correlation()
# Quick exploratory histograms and barplots of individual predictors.
# Age & Sex histogram
ggplot(clean, aes(x = Age, fill = Sex)) +
  geom_histogram(binwidth = 5) +
  facet_wrap(~Survived)
# Fare & Sex histogram
ggplot(clean, aes(x = Fare, fill = Sex)) +
  geom_histogram(binwidth = 20) +
  facet_wrap(~Survived)
# SibSp histogram
ggplot(clean, aes(x = SibSp, fill = Survived)) +
  geom_histogram(binwidth = 1)
# Title barplot
ggplot(clean, aes(x = Title, fill = Survived)) +
  geom_bar()
# Embarked barplot
ggplot(clean, aes(x = Embarked, fill = Survived)) +
  geom_bar(position = 'dodge')
# Parch barplot
ggplot(clean, aes(x = Parch, fill = Survived)) +
  geom_bar()
################### Model 5 ###################
# Residual diagnostics for model 5.
# NOTE(review): for a glm, model5$residuals are the *working* residuals;
# deviance residuals via resid(model5) may be what was intended — confirm.
plot(model5$fitted.values, model5$residuals)
plot(train$Age, model5$residuals)
# identify() is interactive; in a non-interactive run it returns integer(0).
identify(train$Age , model5$residuals)
## integer(0)
plot(train$Fare , model5$residuals)
identify(train$Fare , model5$residuals)
## integer(0)
plot(train$SibSp , model5$residuals)
plot(train$Parch , model5$residuals)
# QQplot (useless for logistic regression)
qqnorm(model5$residuals)
qqline(model5$residuals)
# Residual Histogram
hist(model5$residuals)
# Residual plot
# We discovered 3 outlier which are no. obs 202, 266, 323
plot(model5$fitted.values, model5$residuals)
identify(model5$fitted.values, model5$residuals)
## integer(0)
# Inspect the three flagged training-set rows (positional indices into train).
train[c(202,266,323),]
## # A tibble: 3 × 9
## Survived Pclass Title Sex Age SibSp Parch Fare Embarked
## <lgl> <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 FALSE 1 Miss female 2 1 2 152. S
## 2 TRUE 3 Mr male 39 0 0 7.92 S
## 3 FALSE 1 Mrs female 25 1 2 152. S
# obs 202 passengers ID = 298
# Name: Allison, Miss. Helen Loraine
# Age: 2
# Fare: 152
# Survived: 0
# Pull the three flagged passengers from the raw data by PassengerId.
p_298 <- filter(titanic, PassengerId == 298)
p_298
## # A tibble: 1 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 298 0 1 Alliso… fema… 2 1 2 113781 152. C22 …
## # … with 1 more variable: Embarked <chr>
# obs 266 passengers ID = 401
p_401 <- filter(titanic, PassengerId == 401)
View(p_401)
# obs 323 passengers ID = 499
p_499 <- filter(titanic, PassengerId == 499)
# NOTE(review): this views p_298 again right after defining p_499 — was
# View(p_499) intended? Verify.
View(p_298)
# Age & Sex histogram
ggplot(data = clean, mapping = aes(x = Age, fill = Sex)) +
  geom_histogram(binwidth = 5) + facet_wrap(~Survived) +
  ggtitle("Survivor by Age & Gender") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set1")
# Recode Survived (currently "TRUE"/"FALSE" strings) to a factor so the
# fill scales below treat it as discrete.
# FIX: use <- for assignment per R convention.
clean$Survived <- as.factor(clean$Survived)
# Fare & Sex boxplot (the original comment said histogram; this is a boxplot)
clean %>%
  ggplot(aes(x = Survived, y = Fare, fill = Sex)) +
  geom_boxplot() +
  theme_bw() +
  ggtitle("Survivors by Gender & Fare") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set1")
# Title barplot
ggplot(clean, aes(x = Title, fill = Survived)) + geom_bar() +
  ggtitle("Survivors' Title") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set1")
# Survival proportion by number of siblings/spouses aboard, split by sex.
# count() replaces the group_by() %>% count() %>% ungroup() chain.
clean %>%
  count(SibSp, Survived, Sex) %>%
  group_by(SibSp, Sex) %>%
  mutate(surv_prop = n / sum(n)) %>%
  ggplot(aes(x = SibSp, y = surv_prop, fill = Survived)) +
  geom_col() +
  facet_wrap(~Sex) +
  labs(title = "No. of survivors by Gender and no. siblings/spouses on board",
       y = "Probability", x = "No. of siblings/spouses on board") +
  scale_fill_brewer(palette = "Set1") +
  theme_test()
# Survival proportion by number of parents/children aboard, split by sex.
clean %>%
  count(Parch, Survived, Sex) %>%
  group_by(Parch, Sex) %>%
  mutate(surv_prop = n / sum(n)) %>%
  ggplot(aes(x = Parch, y = surv_prop, fill = Survived)) +
  geom_col() +
  facet_wrap(~Sex) +
  labs(title = "No. of survivors by Gender and no. of parents/children on board",
       y = "Probability", x = "No. of parents/children on board") +
  scale_fill_brewer(palette = "Set1") +
  theme_test()
################### Model 5 ###################
# NOTE(review): this entire chunk is a verbatim duplicate of the earlier
# "Model 5" diagnostics section — presumably an accidental copy; verify
# before removing.
plot(model5$fitted.values, model5$residuals)
plot(train$Age, model5$residuals)
# identify() is interactive; in a non-interactive run it returns integer(0).
identify(train$Age , model5$residuals)
## integer(0)
plot(train$Fare , model5$residuals)
identify(train$Fare , model5$residuals)
## integer(0)
plot(train$SibSp , model5$residuals)
plot(train$Parch , model5$residuals)
# QQplot (useless for logistic regression)
qqnorm(model5$residuals)
qqline(model5$residuals)
# Residual Histogram
hist(model5$residuals)
# Residual plot
# We discovered 3 outlier which are no. obs 202, 266, 323
plot(model5$fitted.values, model5$residuals)
identify(model5$fitted.values, model5$residuals)
## integer(0)
train[c(202,266,323),]
## # A tibble: 3 × 9
## Survived Pclass Title Sex Age SibSp Parch Fare Embarked
## <lgl> <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 FALSE 1 Miss female 2 1 2 152. S
## 2 TRUE 3 Mr male 39 0 0 7.92 S
## 3 FALSE 1 Mrs female 25 1 2 152. S
# obs 202 passengers ID = 298
# Name: Allison, Miss. Helen Loraine
# Age: 2
# Fare: 152
# Survived: 0